---
title: "Main Model Code for 'When Corporations Are People' (SMR)"
output: 
  html_document:
    toc: yes
    toc_float: true
    number_sections: true
---

Replication materials for Knight's "When Corporations Are People", SMR

# Set up 
```{r setup, include=FALSE}
rm(list = ls())

library(stm)
library(tidyverse)
library(tidytext)
library(dplyr)
library(stringr)
library(stats)
library(textstem)
library(quanteda)
library(fastDummies)

library(ggplot2)
library(gridExtra)
library(emmeans)
library(cowplot)
library(scales)
```

# Load data 

Load main datasets:
(1) Agency_files.Rdata: File-level (Nrow: 435,655, ncol: 29)
(2) Agency_entities.Rdata: Entity-level (Nrow: 868,183, ncol: 13)
(3) stm12.Rdata: STM output file

```{r, echo=FALSE}
load("agency_files.Rdata")
load("agency_entities.Rdata")
load("stm12.Rdata")
```

#Figures and Tables 

## Figure 1: Number of Files Over Time

Code snippet for Figure 1, number of files over time 

```{r time, echo=FALSE}
summary<-agency_files %>%
  group_by(Pubname, Year)%>%
  summarize(N = n(), .groups = 'drop')

#Figure 1
summary %>%
  ggplot(aes(x=Year, y = N, group = Pubname)) +
  geom_col(aes(fill = Pubname))+
  ylab("Article Count")+
  xlab("")+
  scale_fill_manual(values = c("darkgrey", "black"))+
  scale_y_continuous(label=comma)+
  theme_minimal()+
  theme(legend.title = element_blank(), legend.position = "bottom")

```

## Table 3: File-level descriptives for context analysis 

Descriptive statistics for Table 3

```{r}
# Generate dummies
files_desc<- agency_files %>%
  select(dummy_mental_recoded, dummy_comm_recoded, N, fulltext_neg_sent_score, fulltext_pos_sent_score, fulltext_count, spellcorrect, Article_dummies, Publication) %>%
  dummy_cols(., select_columns = 'Article_dummies')%>%
  select(-Article_dummies)

t(files_desc%>% group_by(Publication)%>%
  summarise_all(.funs = c(mean="mean", sd = "sd", min = "min", max = "max")))

nrow(agency_files)
#435655
nrow(agency_files %>% filter(Publication == "New York Times Company"))
#337546
nrow(agency_files %>% filter(Publication == "Dow Jones & Company Inc"))
#98109
```

## Table 4: Top Agent Talk Elements

Uses the dataset "agency_entities.Rdata" to show frequent subject-verb relations 

```{r, echo=FALSE}
#All entities: Action
agency_entities %>%
  filter(Deptype =="nsubj")%>%
  mutate(subject_action = paste(Entity_lemma, lemma, sep = "_"))%>%
  count(subject_action) %>%    
  mutate(prop = prop.table(n))%>%
  arrange(-n)%>%
  top_n(15)

#Named entities: Action
agency_entities %>%
  filter(Deptype =="nsubj" & NamedEntity ==1)%>%
  mutate(subject_action = paste(Entity_lemma, lemma, sep = "_"))%>%
  count(subject_action) %>%    
  mutate(prop = prop.table(n))%>%
  arrange(-n)%>%
  top_n(15)

#Passive action
agency_entities %>%
  filter(Deptype =="nsubj:pass")%>%
  mutate(subject_action = paste(Entity_lemma, Verb, sep = "_"))%>%
  count(subject_action) %>%    
  mutate(prop = prop.table(n))%>%
  arrange(-n)%>%
  top_n(15)

#All Intentionality
agency_entities %>%
  filter(Deptype =="nsubj" &  mental_recoded ==1)%>%
  mutate(subject_action = paste(Entity_lemma, lemma, sep = "_"))%>%
  count(subject_action) %>%    
  mutate(prop = prop.table(n))%>%
  arrange(-n)%>%
  top_n(15)

#All Speech
agency_entities %>%
  filter(Deptype =="nsubj" &  comm_recoded ==1)%>%
  mutate(subject_action = paste(Entity_lemma, lemma, sep = "_"))%>%
  count(subject_action) %>%    
  mutate(prop = prop.table(n))%>%
  arrange(-n)%>%
  top_n(15)

```

## Figure 3: Agent Talk Over Time

Uses the dataset "agency_entities.Rdata" to show descriptives on agent talk discourse, by article, over time. 

Panel A: Count of articles with any active or passive action 
Panel B: Percent of articles with any intentionality or communication 

```{r, echo=FALSE}

#############################################################
#Panel A: Count of articles with any active or passive action 
############################################################

summary<-agency_entities %>%
  group_by(Pubname, Year, Deptype)%>%
  summarize(N = n(), .groups = 'drop') %>%
  mutate(Deptype = recode(Deptype, `nsubj` = "Active subject",
                          `nsubj:pass` = "Passive subject"))

p1<- summary %>%
  ggplot(aes(x=Year, y = N, group = Pubname)) +
  geom_line(aes(linetype = Pubname))+
  ylab("Count, Active and Passive Subjects")+
  xlab("")+
  scale_fill_manual(values = c("darkgrey", "black"))+
  scale_y_continuous(label=comma)+
  theme_minimal()+
  facet_grid(~Deptype)+
  theme(axis.text.x=element_text(angle=45, hjust =1),
        axis.title.y = element_text(size = 9))


###############################################################
#Panel B: Percent of articles with any intentionality or communication 
##############################################################

#Summarize by Article
articles <- agency_entities %>%
  filter(Deptype == "nsubj")%>%
  group_by(Year, Pubname, Filename)%>%
  select(Filename, Pubname, Year, mental_recoded, comm_recoded)%>%
  summarise_each(funs(max(., na.rm=T)))

#articles a year with either
articles_long <- articles %>%
   gather(variable, value, mental_recoded:comm_recoded)

articles_with_agency<- articles_long %>%
  group_by(Year, Pubname, variable)%>%
  summarise(mean = mean(value, na.rm =T), 
            se = sd(value)/sqrt(length(value)))%>%
  mutate(variable = recode(variable, `comm_recoded` = "Organizational speech",
                          `mental_recoded` = "Organizational Intentionality"))
            
p2<-articles_with_agency %>%
  ggplot(aes(x=Year, y = mean, group = Pubname)) +
  geom_line(aes(linetype=Pubname))+
  geom_ribbon(aes(ymin=mean- se,  ymax=mean+ se), alpha = .3) +
  facet_grid(~variable, scales = "free_y")+
  ylab("Verb Frequency (%)")+
  xlab("")+
  theme_minimal()+
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        axis.text.x=element_text(angle=45, hjust =1), 
         axis.title.y = element_text(size = 9))


############
#Combine
##############

plot.row<- plot_grid(p1 +theme(legend.position="none"),
             p2 + theme(legend.position="none"),
             labels = c('A', 'B'), 
             align = "vh")

legend <-get_legend(p2) 

p<- plot_grid(plot.row, legend, ncol=1,
               rel_heights = c(1, .1))
              
p

```

















## Figure 4: Topic Model Frex Words
```{r}
#number of topics
ntopic = 12

#word scores
td_beta <- tidy(stm12)

td_beta<-td_beta %>%
  filter(!term == "")

#get frex words
labels<- labelTopics(stm12, seq(1,ntopic), n=7)

df <- data.frame(matrix(unlist(labels$frex), nrow=7, byrow=T))

df<-as_tibble(df)

df1 <-df %>%
  gather(key = "topic", value = "term")%>%
  mutate(topic = str_remove(topic, "X")) %>%
  mutate(topic = as.numeric(topic))

##########
#Visualize
##########

df2<-df1 %>% 
  left_join(as_tibble(td_beta), by = c('topic', 'term') ) %>%
    mutate(topic = recode(topic, `1` = "Business opinion", 
          `2` = "International news", 
         `3` = "Business outlook",
         `4` = "Oil + gas",
         `5` = "Politics",
         `6` = "Banking",
         `7` = "Stocks",
         `8` = "Regulation",
         `9` = "Local organizations",
         `10` = "Railroads",
         `11` = "Courts",
         `12` = "Labor")) %>%
  mutate(topic = factor(topic, levels = c("Banking", "Business outlook", "Oil + gas", "Railroads", "Stocks", "Courts", "Labor","Politics","Regulation", "Business opinion", "International news", "Local organizations")))%>%
  mutate(topiccolor = ifelse(topic %in%  c("Labor", "Regulation", "Courts", "Politics"), 0, ifelse(topic %in% c("Local organizations", "International news", "Business opinion"), -1, 2)))


df2 %>%
  group_by(topic, term) %>%  
  arrange(desc(beta)) %>%
  ungroup() %>%
  mutate(term = factor(paste(term, topic, sep = "__"), levels = rev(paste(term, topic, sep = "__")))) %>% 
  ggplot(aes(term, beta, fill = as.factor(topiccolor))) +
  geom_col(alpha = 0.8, show.legend = FALSE, color = "black", size =.2) +
  facet_wrap(~ topic, scales = "free") +
  #scale_fill_manual(values = c("white", "lightgrey", "grey", "darkgrey", "black"))+
  theme_minimal()+
  coord_flip() +
  ylab("Frequency")+
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x))+
  scale_fill_grey(start = 1, end = 0)+
  labs(x = NULL, y = expression(beta))+
     theme(axis.text.x=element_text(angle=45, hjust =1))

```


## Figure 5: Topic Model Regression 

```{r}

##############
# Any Agency
##############

#main formula + outputs
f.a<- agency~ factor(Publication) + spellcorrect + N + fulltext_count + fulltext_neg_sent_score + fulltext_pos_sent_score + factor(Article_dummies) + factor(Year) + factor(dayofweek)
outputa<-matrix(data=NA,nrow=ntopic,ncol=3)
fullresultsa<-c()

#run analysis
for (i in 1:ntopic){
  topic.star <-paste0('topic', i)
  model<-update(f.a,paste("~ . +", topic.star))
  results <- glm(model, family = binomial(link = 'logit'), data= agency_files)
  outputa[i, ] <-c(topic.star, as.numeric(summary(results)$coefficients[, 1][topic.star]), as.numeric(summary(results)$coefficients[, 2][topic.star]))
  fullresultsa[[i]]<-results
}

#save output for visualization
outputa<-as_tibble(outputa)
colnames(outputa)<-c("topic", "coef", "se")
outputa$se<-as.numeric(outputa$se)
outputa$coef<-as.numeric(outputa$coef)

##############
#Mental states
##############

#main formula + outputs
f.m<- dummy_mental_recoded~ factor(Publication) + spellcorrect + N + fulltext_count + fulltext_neg_sent_score + fulltext_pos_sent_score + factor(Article_dummies) + factor(Year) + factor(dayofweek)
outputm<-matrix(data=NA,nrow=ntopic,ncol=3)
fullresultsm<-c()

#loop thorugh model
for (i in 1:ntopic){
  topic.star <-paste0('topic', i)
  model<-update(f.m,paste("~ . +", topic.star))
  results <- glm(model, family = binomial(link = 'logit'), data= agency_files)
  outputm[i, ] <-c(topic.star, as.numeric(summary(results)$coefficients[, 1][topic.star]), as.numeric(summary(results)$coefficients[, 2][topic.star]))
  fullresultsm[[i]]<-results

}

#prepare data for visualization
outputm<-as_tibble(outputm)
colnames(outputm)<-c("topic", "coef", "se")
outputm$se<-as.numeric(outputm$se)
outputm$coef<-as.numeric(outputm$coef)

##############
#Communication
##############

#main formula + outputs
f.c<- dummy_comm_recoded~ factor(Publication) + spellcorrect + N + fulltext_count + fulltext_neg_sent_score + fulltext_pos_sent_score +  factor(Article_dummies) + factor(Year) + factor(dayofweek)
outputc<-matrix(data=NA,nrow=ntopic,ncol=3)
fullresultsc<-c()

#loop through
for (i in 1:ntopic){
  topic.star <-paste0('topic', i)
  model<-update(f.c,paste("~ . +", topic.star))
  results <- glm(model, family = binomial(link = 'logit'), data= agency_files)
  outputc[i, ] <-c(topic.star, as.numeric(summary(results)$coefficients[, 1][topic.star]), as.numeric(summary(results)$coefficients[, 2][topic.star]))
  fullresultsc[[i]]<-results

}

#save
outputc<-as_tibble(outputc)
colnames(outputc)<-c("topic", "coef", "se")
outputc$se<-as.numeric(outputc$se)
outputc$coef<-as.numeric(outputc$coef)


#######################
#Combine + Visualize
#######################

topics<- paste0("topic", seq(1:ntopic))
outputc$model<- "Speech verb"
outputm$model<- "Intentionality verb"
outputa$model<- "Any agentic verb"

combined_output<-rbind(outputm, outputc, outputa)

combined_output<- combined_output %>%
    mutate(term = recode(topic, `topic1` = "Business opinion", 
          `topic2` = "International news", 
         `topic3` = "Business outlook",
         `topic4` = "Oil + gas",
         `topic5` = "Politics",
         `topic6` = "Banking",
         `topic7` = "Stocks",
         `topic8` = "Regulation",
         `topic9` = "Local organizations",
         `topic10` = "Railroads",
         `topic11` = "Courts",
         `topic12` = "Labor")) 

ordered_topic<- combined_output %>% 
  filter(model == "Any agentic verb") %>%
  arrange(-coef)%>%
  pull(term)

combined_output %>%
  mutate(term = ordered(term, levels = rev(ordered_topic)))%>%
  ggplot(aes(x= term, y= coef, shape = model, color = model))+
  geom_point(position = position_dodge(0.5))+
  geom_errorbar(aes(ymin = coef- 2*se, ymax = coef+2*se), width = 0, position = position_dodge(0.5) )+
  coord_flip()+
  geom_hline(yintercept=0, color = "black", linetype = 2)+
  ylab("")+
  xlab("Coefficient estimate")+
  scale_shape_manual(breaks = c("Speech verb", "Intentionality verb", "Any agentic verb"), values=c(15,16, 17) )+
  scale_colour_grey(breaks = c("Speech verb", "Intentionality verb", "Any agentic verb"), start = .1, end = .8 )+
  theme_minimal()+
  theme(legend.title = element_blank(), legend.position="right", 
        legend.background =element_blank(),
        legend.text=element_text(size=8),
        legend.key.size = unit(0.5, "cm"),
        axis.title.x =  element_text(size =8))


```


## Figure 6: Time Trend Analysis

```{r}

#Set up lists to store output
results_list<- c()
outputlist<-c()

#Main model
main.model<- dummy_comm_recoded~ factor(Publication) +N +spellcorrect +fulltext_count + fulltext_pos_sent_score +fulltext_neg_sent_score +  factor(Article_dummies) + factor(Year) + factor(dayofweek) + Year*keytopic

#Iterate over topics
for (i in 1:ntopic){
  topic.star <-paste0('topic', i)
  datastar<- agency_files %>% mutate(keytopic = eval(as.name(topic.star)))
  results <- glm(main.model, family = binomial(link = 'logit'), data= datastar)
  mylist <- list(Years=seq(1890,1934), keytopic=c(1))
  margins<-emmip(results,keytopic~Year,at=mylist, CIs=TRUE, type = "response", plotit=FALSE)
  margins$topic<- topic.star
  outputlist[[i]] <- margins
  results_list[[i]] <- results
}

#combine outputs
output_combined<- bind_rows(outputlist, .id = "column_label")
  
#order by coefficient size
ordered_topic<- output_combined %>% 
  group_by(topic)%>%
  summarize(max_coef = max(yvar))%>%
  arrange(desc(max_coef))%>%
  pull(topic)

#Plot

output_combined%>%
  mutate(topic = factor(topic, levels = ordered_topic))%>%
    mutate(term = recode(topic, `topic1` = "Business opinion", 
          `topic2` = "International news", 
         `topic3` = "Business outlook",
         `topic4` = "Oil + gas",
         `topic5` = "Politics",
         `topic6` = "Banking",
         `topic7` = "Stocks",
         `topic8` = "Regulation",
         `topic9` = "Local\n organizations",
         `topic10` = "Railroads",
         `topic11` = "Courts",
         `topic12` = "Labor")) %>%
  ggplot(aes(x=Year, y=yvar))+
  geom_point(size =.5)+
  #geom_smooth(method = "lm", color = "blue")+
  geom_errorbar(aes(ymin =LCL, ymax = UCL), width =0)+
  geom_hline(yintercept=0, color = "black")+
  theme_minimal()+
  ylab("Predicted probability")+
  xlab("")+
  theme_minimal()+
 theme(axis.text.x=element_text(angle=45, hjust =1))+
  facet_wrap(~term, nrow=2)


```




